# Importing all the required libraries
library(tidyverse)
library(ggplot2)
library(plotly)
library(ipumsr)
library(haven)
library(splitstackshape)
# A. Loading the data and creating a dataframe
df_ddi <- read_ipums_ddi("Dataset/nhis_00001.xml")
df <- as.data.frame(read_ipums_micro(df_ddi, verbose = FALSE))
df %>% head(5)
## YEAR SERIAL STRATA PSU NHISHID REGION URBRRL PERNUM NHISPID
## 1 2020 1 150 25 0002020H000002 3 2 1 0002020H00000210
## 2 2020 2 111 10 0002020H000003 4 3 1 0002020H00000310
## 3 2020 3 133 3 0002020H000004 2 2 1 0002020H00000410
## 4 2020 4 139 45 0002020H000007 3 3 1 0002020H00000710
## 5 2020 5 130 21 0002020H000009 3 1 1 0002020H00000910
## HHX SAMPWEIGHT LONGWEIGHT PARTWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN
## 1 H000002 5946.002 17605.50 0.000 1 0 56 2 2
## 2 H000003 6288.726 0.00 5853.664 1 0 66 1 2
## 3 H000004 6083.271 0.00 5814.397 1 0 59 2 2
## 4 H000007 11306.962 0.00 10626.550 1 0 56 1 2
## 5 H000009 6471.818 19317.18 0.000 1 0 48 2 2
## MARSTAT MARST FAMSIZE PARTNEREMP FAMKIDNO RACEA ARMFEV EDUC SPOUSEDUC
## 1 10 11 3 0 1 100 11 501 3
## 2 10 11 2 0 0 100 11 501 10
## 3 10 11 2 0 0 100 11 502 9
## 4 30 30 1 0 0 100 11 103 0
## 5 99 99 4 0 1 100 98 400 0
## SCHOOLNOW EMPSTAT HOURSWRK PAIDSICK EMPHI EMPFT INCFAM07ON FAMTOTINC USUALPL
## 1 1 200 0 0 0 0 11 27000 2
## 2 1 100 40 2 2 2 24 250000 3
## 3 1 100 40 1 1 2 24 150000 2
## 4 1 999 0 0 0 0 22 54000 2
## 5 8 999 0 0 0 0 23 95000 2
## HINOTCOVE CVDDIAG CVDTEST CVDTESTRSLT
## 1 1 1 1 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 2 1 1 0
# B. Preliminary check on the data
# a. Review classes
sapply(df, class)
## $YEAR
## [1] "numeric"
##
## $SERIAL
## [1] "numeric"
##
## $STRATA
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $PSU
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $NHISHID
## [1] "character"
##
## $REGION
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $URBRRL
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $PERNUM
## [1] "numeric"
##
## $NHISPID
## [1] "character"
##
## $HHX
## [1] "character"
##
## $SAMPWEIGHT
## [1] "numeric"
##
## $LONGWEIGHT
## [1] "numeric"
##
## $PARTWEIGHT
## [1] "numeric"
##
## $ASTATFLG
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CSTATFLG
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $AGE
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $SEX
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $SEXORIEN
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $MARSTAT
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $MARST
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $FAMSIZE
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $PARTNEREMP
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $FAMKIDNO
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $RACEA
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $ARMFEV
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EDUC
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $SPOUSEDUC
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $SCHOOLNOW
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EMPSTAT
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $HOURSWRK
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $PAIDSICK
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EMPHI
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EMPFT
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $INCFAM07ON
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $FAMTOTINC
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $USUALPL
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $HINOTCOVE
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CVDDIAG
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CVDTEST
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CVDTESTRSLT
## [1] "haven_labelled" "vctrs_vctr" "integer"
# Note: some of the variables have attached data definitions "haven labeled"
# b. Check for missing values
sapply(df, function(x) sum(is.na(x)))
## YEAR SERIAL STRATA PSU NHISHID REGION
## 0 0 0 0 0 0
## URBRRL PERNUM NHISPID HHX SAMPWEIGHT LONGWEIGHT
## 0 0 0 0 0 0
## PARTWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN
## 0 0 0 0 0 0
## MARSTAT MARST FAMSIZE PARTNEREMP FAMKIDNO RACEA
## 0 0 0 0 0 0
## ARMFEV EDUC SPOUSEDUC SCHOOLNOW EMPSTAT HOURSWRK
## 0 0 0 0 0 0
## PAIDSICK EMPHI EMPFT INCFAM07ON FAMTOTINC USUALPL
## 0 0 0 0 0 0
## HINOTCOVE CVDDIAG CVDTEST CVDTESTRSLT
## 0 0 0 0
#c. Dropping variables that are not that important or provide no inisghts in the data
droppable_cols <- c("YEAR","SERIAL","STRATA","PSU","NHISHID","NHISPID","HHX","SAMPWEIGHT","LONGWEIGHT","PARTWEIGHT")
df <- df %>% dplyr::select(-all_of(droppable_cols))
colnames(df)
## [1] "REGION" "URBRRL" "PERNUM" "ASTATFLG" "CSTATFLG"
## [6] "AGE" "SEX" "SEXORIEN" "MARSTAT" "MARST"
## [11] "FAMSIZE" "PARTNEREMP" "FAMKIDNO" "RACEA" "ARMFEV"
## [16] "EDUC" "SPOUSEDUC" "SCHOOLNOW" "EMPSTAT" "HOURSWRK"
## [21] "PAIDSICK" "EMPHI" "EMPFT" "INCFAM07ON" "FAMTOTINC"
## [26] "USUALPL" "HINOTCOVE" "CVDDIAG" "CVDTEST" "CVDTESTRSLT"
# Feature Engineering variables
# REGION, URBRRL PERNUM ASTATFLG CSTATFLG AGE FAMKIDNO dont need any feature engineering yet
# a. Handling SEX AND SEXORIEN variable
# Combining all 'unknown categories' into one
unk <- c(7,8,9)
df$SEX[df$SEX %in% unk] = 9
# Combining Unknown SEXORIEN into "Something else" category
unk <- c(5,7,8)
df$SEXORIEN[df$SEXORIEN %in% unk] = 4
# b. Handling RACEA Variable
# Combining all 'unknown categories' into one
unk <- c(580, 900, 970, 980, 990)
df$RACEA[df$RACEA %in% unk] = 900
#c. Handling MARSTAT & MARST Variable
# Since both of the variables are indicative of same features it makes sense to keep on their current marital status for this analysis.
df <- df %>% select(-MARSTAT)
# Combining NIU and Unknown into one and combining all married labels into one
unk <- c(0,99)
df$MARST[df$MARST %in% unk] = 99
marr <- c(10,11,12,13)
df$MARST[df$MARST %in% marr] = 10
# d. Handling FAMSIZE
# Combine Unknowns into one
unk <- c(98,99)
df$FAMSIZE[df$FAMSIZE %in% unk] = 98
# e. Handling PARTNEREMP
unk <- c(7,8,9)
df$PARTNEREMP[df$PARTNEREMP %in% unk] = 9
# f. Handling ARMFEV by combining all unknowns
unk <- c(97,98,99)
df$ARMFEV[df$ARMFEV %in% unk] = 99
# g. Handling EDUC, SPOUSEDUC, SCHOOLNOW, by combining all unknowns
unk <- c(996,997,998,999)
df$EDUC[df$EDUC %in% unk] = 996
unk <- c(97,98,99)
df$SPOUSEDUC[df$SPOUSEDUC %in% unk] = 99
unk <- c(7,8,9)
df$SCHOOLNOW[df$SCHOOLNOW %in% unk] = 9
#h. Handling Employment Status
# Table below shows the distribution of employment categories
#Labels:
#value label
# 0 NIU
# 100 Employed
# 110 Working
# 111 Working for pay at job/business
# 112 Working, w/out pay, at job/business
# 120 With job, but not at work
# 121 With job, not at work: not laid-off, not looking
# 122 With job, not at work: looking
# 200 Not employed
# 210 Unemployed
# 211 Unemployed: On layoff
# 212 Unemployed: On layoff and looking
# 213 Unemployed: Unk if looking or laid off
# 214 Unemployed: Looking or on layoff
# 215 Unemployed: Have job to return to
# 216 Unemployed: Had job during the round
# 217 Unemployed: No job during reference period
# 220 Not in labor force
# 900 Unknown-all causes
# 997 Unknown-refused
# 998 Unknown-not ascertained
# 999 Unknown-don't know
# Combining all working into one, with job into one, Unemployed into one and unknown into one
work <- c(110,111,112)
df$EMPSTAT[df$EMPSTAT %in% work] = 110
w_job <- c(120,121,122)
df$EMPSTAT[df$EMPSTAT %in% w_job] = 120
unemployed <- c(200,210,211:217)
df$EMPSTAT[df$EMPSTAT %in% unemployed] = 200
unk <- c(997:999)
df$EMPSTAT[df$EMPSTAT %in% unk] = 999
# i. Handling HOURSWRK by replacing number of hours unknown into 0
unk <- c(0,97:99)
df$HOURSWRK[df$HOURSWRK %in% unk] = 0
# j. Handling all variables remaining
# Combined unknowns into one
df <- df %>% mutate(PAIDSICK = replace(PAIDSICK,PAIDSICK >4,9))
# Mutating EMPHI
df <- df %>% mutate(EMPHI = replace(EMPHI,EMPHI >4,9))
# Mutating USUALPL
# value label
# 0 NIU
# 1 There is no place or No
# 2 Yes, has a usual place or Yes
# 3 There is more than one place
# 7 Unknown-refused
# 8 Unknown-not ascertained
# 9 Unknown-don't know
# 2-3 combined as 2 and 7,8,9 combined as 3
df <- df %>%
mutate(USUALPL = replace(USUALPL, USUALPL == 3,2))
df <- df %>%
mutate(USUALPL = replace(USUALPL, USUALPL >= 7,9))
# Mutating HINOTCOVE
# Combine all unknowns into one
df <- df %>% mutate(HINOTCOVE = replace(HINOTCOVE,HINOTCOVE >4,9))
# Mutating CVDTEST
# Same transformation as above
df <- df %>% mutate(CVDTEST = replace(CVDTEST,CVDTEST >4,9))
# Mutating CVDDIAG
# Same transformation as above
df <- df %>% mutate(CVDDIAG = replace(CVDDIAG,CVDDIAG >4,9))
# Mutating CVDTESTRSLTS
#Labels:
# value label
# 0 NIU
# 1 No
# 2 Yes
# 3 Did not receive results
# 7 Unknown-refused
# 8 Unknown-not ascertained
# 9 Unknown-don't know
#Similar transformation where we combine all unknowns to 9
df <- df %>% mutate(CVDTESTRSLT = replace(CVDTESTRSLT,CVDTESTRSLT >4,9))
# A. Checking for health insurance coverage and covid relation
counts_hinc <- table(df$HINOTCOVE, df$CVDTESTRSLT)
rownames(counts_hinc) <- c('Has coverage','No coverage','Unknown')
colnames(counts_hinc) <- c('NIU', 'Negative','Positive',"Did Not Receive results",'Unknown')
barplot(counts_hinc[,2:3], col = rainbow(3:6),
main = 'Test Results based on Insurance Coverage', legend = rownames(counts_hinc)[1:2], xlab = 'COVID Test Result')
# B. Seeing the number of people who got tested given they have health insurance coverage from the company
sample_df <- df %>% group_by(CVDTEST = as.factor(CVDTEST),EMPHI = as.factor(EMPHI)) %>% dplyr::summarize(count = n())
## `summarise()` has grouped output by 'CVDTEST'. You can override using the `.groups` argument.
ggplot(sample_df, aes(fill=CVDTEST, y=count, x=CVDTEST)) +
geom_bar(position="dodge", stat="identity") +
ggtitle("People who tested when they get Health insurance coverage from the Company") +
facet_wrap(~EMPHI, ncol = 4, labeller = labeller(CVDTEST = c("0" = "NIU","1" = "Not Covered by Company", "2" = "Covered by Company", "9" = "Unknown" ))) +
theme(legend.position="none") +
xlab("Covid Test") + ylab("Frequency") + scale_x_discrete(labels = c("NIU","Negative","Positive","No Results")) + scale_fill_manual(values = c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C")) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 30))
# C. Seeing results between getting sick leave and covid result tests
counts_ps <- table(df$PAIDSICK, df$CVDTESTRSLT)
rownames(counts_ps) <- c('NIU','No sick leave','Sick leave', 'Unknown')
colnames(counts_ps) <- c('NIU', 'Negative','Positive', "Did Not Receive Results",'Unknown')
barplot(counts_ps[2:3,2:3], col = rainbow(2),
legend = rownames(counts_ps)[2:3], main = 'Test Results based on having Sick Leave', xlab = 'Testing Opportunity')
# D. Does having a usual place for medical care affect the testing
sample_df <- df %>% filter(USUALPL == 2)
sample_df <- sample_df %>% group_by(CVDTEST) %>% summarise(n = n())
sample_df %>% head()
## # A tibble: 4 x 2
## CVDTEST n
## <int+lbl> <int>
## 1 0 [NIU] 16418
## 2 1 [No] 12956
## 3 2 [Yes] 5236
## 4 9 [Unknown-don't know] 33
fig <- plot_ly(sample_df, x = ~CVDTEST, y = ~n, type = 'bar',
marker = list(color = c('rgba(204,204,204,1)', 'rgba(204,204,204,1)', 'rgba(222,45,38,0.8)','rgba(204,204,204,1)')))
fig <- fig %>% layout(title = "Seeing if having Usual Place of medical care encourages Covid Testing",
xaxis = list(title = ""),
yaxis = list(title = ""))
fig
How does socioeconomic status affect COVID-19 infection status?
# A. Creating Stratified Sample
table(df$CVDDIAG)
##
## 0 1 2 9
## 17649 18954 671 84
strat <- stratified(df, group = 'CVDDIAG', size = 1500)
## Groups 2, 9 contain fewer rows than requested. Returning all rows.
# Dropping the NIUs & Unknown as the provide no insights into the data
strat <- strat %>% filter(CVDDIAG != 0)
strat <- strat %>% filter(CVDDIAG != 9)
strat$CVDDIAG[strat$CVDDIAG == 1] = 0
strat$CVDDIAG[strat$CVDDIAG == 2] = 1
# Seeing the distribution of the dataset
table(strat$CVDDIAG)/length(strat$CVDDIAG)
##
## 0 1
## 0.6909258 0.3090742
# Visualising it
barplot(table(strat$CVDDIAG))
# Fitting a logi
What are the driving factors behind infections?
What type of working conditions impacts the ability to get tested for COVID?